import scrapy
from chinaNews.items import XinhuanetItem, ArticleItemLoader
from scrapy.http import Request

# Request headers carrying a desktop Chrome User-Agent string.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/75.0.3770.100 Safari/537.36'
    ),
}


class XinhuanetSpider(scrapy.Spider):
    """Spider that scrapes articles linked from the Xinhuanet start page.

    ``parse`` reads article links from the ``focusListNews`` element of the
    start page and schedules one request per link. ``parse_detail`` loads
    the title, content, URL and image URLs of each article into a
    ``XinhuanetItem``.
    """

    name = 'xinhuanet'
    allowed_domains = ['xinhuanet.com', 'news.cn']
    start_urls = ['http://xinhuanet.com/']

    def parse(self, response):
        """Yield a detail request for every ``htm`` link in the focus list.

        Args:
            response: Response for one of ``start_urls``.

        Yields:
            ``Request`` objects handled by ``parse_detail``. The absolute
            article URL is passed along in ``meta["url"]``.
        """
        hrefs = response.xpath(
            '//*[@id="focusListNews"]/ul/li/span/a/@href'
        ).extract()
        for href in hrefs:
            if not href.endswith('htm'):
                continue
            # Resolve relative / protocol-relative links against the page
            # URL; Request rejects URLs that have no scheme.
            post_url = response.urljoin(href)
            yield Request(
                post_url,
                callback=self.parse_detail,
                meta={"url": post_url},
            )

    def parse_detail(self, response):
        """Build an item from a single article page.

        Args:
            response: Response for an article page. ``meta["url"]`` is used
                as the article URL when present; otherwise ``response.url``.

        Returns:
            The item produced by ``ArticleItemLoader.load_item()`` with the
            ``content``, ``title``, ``url`` and ``images`` fields populated.
        """
        # NOTE(review): predicates such as ["header.domPC"] are bare string
        # literals, which XPath treats as always true, so they do not filter
        # by class. Presumably [@class="..."] was intended -- confirm against
        # the live page markup before tightening these expressions.
        title = response.xpath(
            '//div["header.domPC"]//div["head-line"]/h1/span[@class="title"]/text()'
        )
        if not title:
            # Fallback title location used when the first one matches nothing.
            title = response.xpath('//fjtignoreurl//div["xl-cont-head"]/h1/text()')

        # Fall back to the response URL when the request carried no meta URL.
        url = response.meta.get("url") or response.url

        # urljoin handles relative, root-relative, protocol-relative and
        # already-absolute image sources alike.
        image_urls = [
            response.urljoin(src)
            for src in response.xpath('//*[@id="detail"]//img/@src').extract()
        ]

        item_loader = ArticleItemLoader(item=XinhuanetItem(), response=response)
        item_loader.add_xpath('content', '//*[@id="detail"]')
        item_loader.add_value('title', title.extract())
        item_loader.add_value('url', url)
        item_loader.add_value('images', image_urls)
        return item_loader.load_item()


